# Setup: load the tidyverse and read the lexical decision task (LDT) data
# (12 words with corpus frequency and mean response time).
# FIX: removed `rm(list = ls())` — wiping the global environment at the top of
# a script is an anti-pattern; run the script in a fresh R session instead.
library(tidyverse)
ldt <- read_csv("../data/winter-elp.csv", show_col_types = FALSE)
# Quick overview of the columns and their types.
glimpse(ldt)
## Rows: 12
## Columns: 3
## $ word <chr> "thing", "life", "door", "angel", "beer", "disgrace", "kitten", "…
## $ freq <dbl> 55522, 40629, 14895, 3992, 3850, 409, 241, 238, 66, 32, 4, 4
## $ rt <dbl> 621.77, 519.56, 507.38, 636.56, 587.18, 705.00, 611.26, 794.35, 7…
# Scatter plot of raw frequency against response time; each point is
# labelled with its word (labels nudged just below the points).
ggplot(ldt, aes(x = freq, y = rt, label = word)) +
  geom_point() +
  geom_text(nudge_y = -10)
some terminology:
log-scaling:
# Same plot on a log-transformed frequency axis, with an OLS fit line added;
# the relationship looks much more linear on this scale.
ggplot(ldt, aes(x = log(freq), y = rt, label = word)) +
  geom_point() +
  geom_text(nudge_y = -10) +
  geom_smooth(method = 'lm', formula = y ~ x, se = FALSE)
\[ \text{rt} \approx b_0 + b_1\cdot \log(\text{freq}) \]
# Fit a simple linear regression of response time on log frequency.
# Wrapping the assignment in parentheses also prints the fitted model.
(model <- lm(rt ~ log(freq), data = ldt))
##
## Call:
## lm(formula = rt ~ log(freq), data = ldt)
##
## Coefficients:
## (Intercept) log(freq)
## 870.91 -30.52
# Identical model: the intercept term "1" is implicit in the formula above.
lm(rt ~ 1 + log(freq), data = ldt)
##
## Call:
## lm(formula = rt ~ 1 + log(freq), data = ldt)
##
## Coefficients:
## (Intercept) log(freq)
## 870.91 -30.52
interpretation:
Q: How long would you expect the response time to be for a word that appears 100,000 times? How long for one that appears \(10^{15}\) times? How long for one that appears 0 times? How long for one that appears -1000 times?
\[ \text{rt} = b_0 + b_1\cdot \log(\text{freq}) + \varepsilon \]
# Residuals: observed rt minus the model's fitted rt, one per word.
model$residuals
## 1 2 3 4 5 6 7 8
## 84.28883 -27.45269 -70.25887 18.73354 -31.75190 17.63728 -92.24566 90.46203
## 9 10 11 12
## -17.99428 44.74122 -65.09475 48.93525
# Base-R diagnostic plots: residuals vs fitted, normal Q-Q,
# scale-location, and residuals vs leverage.
plot(model)
the first two diagnostic plots (residuals vs fitted, normal Q-Q) are the most important ones
here the model looks good: the residuals show no systematic pattern against the fitted values (linearity and constant variance) and lie close to the Q-Q line (approximate normality), so both assumptions seem to be met
NB: we can also check the most important descriptive statistics of the distribution of the residuals in the model summary:
# Full model summary: residual distribution, coefficient tests,
# residual standard error, and R-squared.
summary(model)
##
## Call:
## lm(formula = rt ~ log(freq), data = ldt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -92.246 -40.088 -0.179 45.790 90.462
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 870.90 40.42 21.548 1.03e-09 ***
## log(freq) -30.52 5.76 -5.299 0.000348 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63.34 on 10 degrees of freedom
## Multiple R-squared: 0.7374, Adjusted R-squared: 0.7111
## F-statistic: 28.08 on 1 and 10 DF, p-value: 0.0003482
# Intercept-only (null) model: predicts the same value for every word.
(model.null <- lm(rt ~ 1, data = ldt))
##
## Call:
## lm(formula = rt ~ 1, data = ldt)
##
## Coefficients:
## (Intercept)
## 679.9
# The null model's single coefficient is exactly the sample mean of rt.
mean(ldt$rt)
## [1] 679.9167
# Sum of squared errors (SSE) of the frequency model ...
sum(model$residuals ** 2)
## [1] 40114.97
# ... and of the null model; comparing the two yields R-squared.
sum(model.null$residuals ** 2)
## [1] 152743.2
\[R^2:=1-\dfrac{\text{SSE}_\text{model}}{\text{SSE}_\text{null}}\]
# R-squared computed by hand from the two SSEs ...
1 - sum(model$residuals ** 2) / sum(model.null$residuals ** 2)
## [1] 0.7373698
# ... equals the squared correlation of predictor and outcome ...
cor(log(ldt$freq), ldt$rt) ** 2
## [1] 0.7373698
# ... and equals what summary() reports.
summary(model)$r.squared
## [1] 0.7373698
# Residual standard error (sigma) as reported by summary() ...
summary(model)$sigma
## [1] 63.33638
# ... is the square root of the sum of squared residuals divided by the
# residual degrees of freedom (n minus the number of coefficients).
sqrt(sum(model$residuals ** 2) / (length(model$residuals) - length(model$coefficients)))
## [1] 63.33638
# Estimated intercept and slope.
model$coefficients
## (Intercept) log(freq)
## 870.90539 -30.52068
# Fitted values: the model's prediction for each observed word.
model$fitted.values
## 1 2 3 4 5 6 7 8
## 537.4812 547.0127 577.6389 617.8265 618.9319 687.3627 703.5057 703.8880
## 9 10 11 12
## 743.0343 765.1288 828.5947 828.5947
# predict() on the original data reproduces the fitted values exactly.
predict(model, ldt)
## 1 2 3 4 5 6 7 8
## 537.4812 547.0127 577.6389 617.8265 618.9319 687.3627 703.5057 703.8880
## 9 10 11 12
## 743.0343 765.1288 828.5947 828.5947
# Predictions for new frequency values, including nonsensical ones:
# log(-500) yields NaN (with a warning), and 10^100 extrapolates to an
# absurd negative response time.
explanatory_data <- tibble(freq = c(-500, 1.2, 10000, 10^100))
predict(model, explanatory_data)
## Warning in log(freq): NaNs produced
## 1 2 3 4
## NaN 865.3408 589.7995 -6156.7408
# Same model summary as above, repeated here for reference.
summary(model)
##
## Call:
## lm(formula = rt ~ log(freq), data = ldt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -92.246 -40.088 -0.179 45.790 90.462
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 870.90 40.42 21.548 1.03e-09 ***
## log(freq) -30.52 5.76 -5.299 0.000348 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63.34 on 10 degrees of freedom
## Multiple R-squared: 0.7374, Adjusted R-squared: 0.7111
## F-statistic: 28.08 on 1 and 10 DF, p-value: 0.0003482
log-scaling is often useful and changes the model fit
other non-linear transformations are possible too, e.g. polynomial terms
linear transformations (adding or multiplying by a constant), on the other hand, do not change the model fit
it is therefore often recommended to centre or standardise variables
goal: the resulting coefficients are more easily interpretable (e.g. the intercept becomes the prediction for an average observation)
# Add transformed columns: log frequency, its z-standardised version,
# and z-standardised response time.
ldt <- ldt %>%
  mutate(log_freq = log(freq)) %>%
  mutate(log_freq_z = (log_freq - mean(log_freq)) / sd(log_freq)) %>%
  mutate(rt_z = (rt - mean(rt)) / sd(rt))
# Same scatter + fit, now using the precomputed log_freq column.
ggplot(ldt, aes(x = log_freq, y = rt, label = word)) +
  geom_point() +
  geom_text(nudge_y = -10) +
  geom_smooth(method = 'lm', formula = y ~ x, se = FALSE)
# Standardised predictor: only the x-axis scale changes, not the shape.
ggplot(ldt, aes(x = log_freq_z, y = rt, label = word)) +
  geom_point() +
  geom_text(nudge_y = -10) +
  geom_smooth(method = 'lm', formula = y ~ x, se = FALSE)
# Both variables standardised: both axes are now in SD units.
ggplot(ldt, aes(x = log_freq_z, y = rt_z, label = word)) +
  geom_point() +
  geom_text(nudge_y = -.1) +
  geom_smooth(method = 'lm', formula = y ~ x, se = FALSE)
# Three fits of the same relationship under linear transformations
# of the variables.
model1 <- lm(rt ~ log_freq, data = ldt) # same as original model
model2 <- lm(rt ~ log_freq_z, data = ldt)
model3 <- lm(rt_z ~ log_freq_z, data = ldt)
# R-squared is invariant under linear transformation of the variables.
summary(model1)$r.squared
## [1] 0.7373698
summary(model2)$r.squared
## [1] 0.7373698
summary(model3)$r.squared
## [1] 0.7373698
model1$coefficients
## (Intercept) log_freq
## 870.90539 -30.52068
# With a z-scored predictor, the intercept becomes the mean rt (679.9).
model2$coefficients
## (Intercept) log_freq_z
## 679.9167 -101.1876
# With both variables z-scored, the intercept is ~0 and the slope equals
# the correlation coefficient (-0.8587 = -sqrt(0.7374)).
model3$coefficients
## (Intercept) log_freq_z
## 1.121728e-16 -8.587024e-01
the output of lm() and summary() is not straightforward to work with programmatically
the broom package provides the same information in tidy (tibble) formats:
broom::glance(model)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.737 0.711 63.3 28.1 0.000348 1 -65.7 137. 139.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# tidy(): one row per coefficient, with estimate, SE, t statistic, p value.
broom::tidy(model)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 871. 40.4 21.5 0.00000000103
## 2 log(freq) -30.5 5.76 -5.30 0.000348
# augment(): one row per observation, with fitted value, leverage (.hat),
# Cook's distance, and standardised residuals.
broom::augment(model)
## # A tibble: 12 × 7
## rt `log(freq)` .fitted .hat .sigma .cooksd .std.resid
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 622. 10.9 537. 0.263 58.2 0.430 1.55
## 2 520. 10.6 547. 0.240 65.9 0.0391 -0.497
## 3 507. 9.61 578. 0.176 61.6 0.160 -1.22
## 4 637. 8.29 618. 0.118 66.4 0.00660 0.315
## 5 587. 8.26 619. 0.116 65.8 0.0187 -0.533
## 6 705 6.01 687. 0.0838 66.5 0.00387 0.291
## 7 611. 5.48 704. 0.0883 58.5 0.113 -1.53
## 8 794. 5.47 704. 0.0884 58.8 0.109 1.50
## 9 725. 4.19 743. 0.119 66.5 0.00617 -0.303
## 10 810. 3.47 765. 0.148 64.8 0.0508 0.765
## 11 764. 1.39 829. 0.280 61.7 0.285 -1.21
## 12 878. 1.39 829. 0.280 63.9 0.161 0.910
# Whole data set: the complete English Lexicon Project word list.
ldt.c <- read_csv("../data/winter-elp-complete.csv", show_col_types = FALSE)
stops <- stopwords::stopwords() # English stopwords
# Flag each word as stopword or not (two-level factor: FALSE/TRUE).
ldt.c <- ldt.c %>% mutate(stop = factor(word %in% stops))
# Distribution of rt by stopword status.
ldt.c %>% ggplot(aes(x = stop, y = rt, fill = stop)) +
geom_boxplot()
# Stopwords cluster at the high end of the (log) frequency distribution.
ldt.c %>% ggplot(aes(x = log(freq), fill = stop)) +
geom_density(alpha = .5)
# Frequency-rt scatter, coloured by stopword status.
ldt.c %>% ggplot(aes(x = log(freq), y = rt, color = stop)) +
geom_point()
remember easiest form of linear model: \[ y = b_0 + b_1\cdot x + \varepsilon \]
obviously, does not work if \(x\) is categorical
however, we can just use a dummy variable, i.e. recode the factor as a numeric variable
here: treatment coding: we code one of the levels as 0 (the reference level) and the other one as 1
# Regression on the dummy-coded factor: R treatment-codes `stop`
# with FALSE as the reference level.
model.stop <- lm(rt ~ stop, data = ldt.c)
# Group means: the intercept equals the FALSE group mean, and the
# stopTRUE coefficient is the difference between the two group means.
ldt.c %>% group_by(stop) %>% summarise(mean_rt = mean(rt))
## # A tibble: 2 × 2
## stop mean_rt
## <fct> <dbl>
## 1 FALSE 770.
## 2 TRUE 605.
summary(model.stop)
##
## Call:
## lm(formula = rt ~ stop, data = ldt.c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -355.08 -90.52 -17.29 71.01 897.92
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 770.0768 0.6838 1126.196 <2e-16 ***
## stopTRUE -165.1377 19.4213 -8.503 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 124.3 on 33073 degrees of freedom
## Multiple R-squared: 0.002181, Adjusted R-squared: 0.002151
## F-statistic: 72.3 on 1 and 33073 DF, p-value: < 2.2e-16
# Diagnostic plots for the dummy-coded model.
plot(model.stop)
\[ \text{rt} = b_0 + b_1\cdot \log(\text{freq}) + \varepsilon \]
# Log frequency vs rt on the complete data set, with OLS fit line.
ldt.c %>% ggplot(aes(x = log(freq), y = rt)) +
geom_point() +
geom_smooth(formula = y ~ x, method = 'lm', se = FALSE)
# Refit the frequency model on the complete data.
model.freq <- lm(rt ~ log(freq), data = ldt.c)
broom::tidy(model.freq)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 907. 1.09 828. 0
## 2 log(freq) -37.5 0.262 -143. 0
# Model-level fit: R-squared = 0.383 on the complete data.
broom::glance(model.freq)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.383 0.383 97.7 20566. 0 1 -198475. 396956. 396981.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
plot(model.freq)
\[ \text{rt} = b_0 + b_1\cdot \text{length} + \varepsilon \]
# Word length vs rt, with OLS fit line.
ldt.c %>% ggplot(aes(x = length, y = rt)) +
geom_point() +
geom_smooth(formula = y ~ x, method = 'lm', se = FALSE)
# Simple regression of rt on length: ~29.8 ms per additional unit of length.
model.length <- lm(rt ~ length, data = ldt.c)
broom::tidy(model.length)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 572. 1.82 314. 0
## 2 length 29.8 0.260 115. 0
# R-squared = 0.284, somewhat lower than the frequency model's 0.383.
broom::glance(model.length)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.284 0.284 105. 13112. 0 1 -200949. 401905. 401930.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
plot(model.length)
# Multiple regression with both predictors: R-squared rises to 0.487,
# and each slope is now interpreted holding the other predictor constant.
summary(lm(rt ~ log(freq) + length, data = ldt.c))
##
## Call:
## lm(formula = rt ~ log(freq) + length, data = ldt.c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -410.24 -60.52 -9.71 48.01 705.53
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 748.4261 2.1761 343.92 <2e-16 ***
## log(freq) -29.5418 0.2579 -114.54 <2e-16 ***
## length 19.4583 0.2377 81.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 89.09 on 33072 degrees of freedom
## Multiple R-squared: 0.4873, Adjusted R-squared: 0.4873
## F-statistic: 1.572e+04 on 2 and 33072 DF, p-value: < 2.2e-16
# Add transformed predictors to the complete data set:
#   log_freq   — natural-log frequency
#   log_freq_z — z-standardised log frequency
#   length_c   — mean-centred length
# BUG FIX: the z-score previously read
#   (log_freq - mean(log_freq) / sd(log_freq))
# where, by operator precedence, the division applies to mean() alone, so the
# column was only SHIFTED by a constant, never divided by its SD. Correct
# standardisation — matching the small-data-set code earlier in this file —
# is (x - mean(x)) / sd(x). NB: downstream printed coefficients for
# log_freq_z will change once this is rerun.
ldt.c <- ldt.c %>% mutate(log_freq = log(freq),
log_freq_z = (log_freq - mean(log_freq)) / sd(log_freq),
length_c = (length - mean(length)))
# Refit the two-predictor model using the transformed columns.
model.freq.length <- lm(rt ~ log_freq_z + length_c, data = ldt.c)
summary(model.freq.length)
##
## Call:
## lm(formula = rt ~ log_freq_z + length_c, data = ldt.c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -410.24 -60.52 -9.71 48.01 705.53
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 825.0849 0.6872 1200.58 <2e-16 ***
## log_freq_z -29.5418 0.2579 -114.54 <2e-16 ***
## length_c 19.4583 0.2377 81.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 89.09 on 33072 degrees of freedom
## Multiple R-squared: 0.4873, Adjusted R-squared: 0.4873
## F-statistic: 1.572e+04 on 2 and 33072 DF, p-value: < 2.2e-16
# NOTE(review): the printed log_freq_z slope (-29.5418) is identical to the
# unstandardised log(freq) slope in the previous model, and the intercept
# (825.08) is not the mean rt — both indicate log_freq_z was merely shifted,
# not divided by its SD, when this output was captured. Verify the mutate()
# call that defines log_freq_z before relying on these numbers.
broom::glance(model.freq.length)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.487 0.487 89.1 15717. 0 2 -195424. 390856. 390889.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
broom::tidy(model.freq.length)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 825. 0.687 1201. 0
## 2 log_freq_z -29.5 0.258 -115. 0
## 3 length_c 19.5 0.238 81.9 0
# Additive model: a single length slope plus an intercept shift for stopwords.
model.length.stop <- lm(rt ~ length_c + stop, data = ldt.c)
summary(model.length.stop)
##
## Call:
## lm(formula = rt ~ length_c + stop, data = ldt.c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -306.31 -75.69 -13.93 61.26 768.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 769.9417 0.5792 1329.369 < 2e-16 ***
## length_c 29.7198 0.2604 114.136 < 2e-16 ***
## stopTRUE -56.2101 16.4778 -3.411 0.000647 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 105.3 on 33072 degrees of freedom
## Multiple R-squared: 0.2842, Adjusted R-squared: 0.2841
## F-statistic: 6564 on 2 and 33072 DF, p-value: < 2.2e-16
# Visualise the additive model: parallel regression lines, one per group.
ldt.c %>% ggplot(aes(length_c, rt, color = stop)) +
geom_point(alpha = .5) +
moderndive::geom_parallel_slopes(se = F)
# Interaction model: length_c * stop allows the length slope to differ
# between the two groups; the interaction term is not significant here.
model.length.stop.inter <- lm(rt ~ length_c * stop, data = ldt.c)
summary(model.length.stop.inter)
##
## Call:
## lm(formula = rt ~ length_c * stop, data = ldt.c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -306.31 -75.69 -13.93 61.27 768.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 769.9417 0.5792 1329.390 <2e-16 ***
## length_c 29.7246 0.2604 114.147 <2e-16 ***
## stopTRUE -163.3215 75.9931 -2.149 0.0316 *
## length_c:stopTRUE -29.2654 20.2691 -1.444 0.1488
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 105.3 on 33071 degrees of freedom
## Multiple R-squared: 0.2842, Adjusted R-squared: 0.2841
## F-statistic: 4377 on 3 and 33071 DF, p-value: < 2.2e-16
# geom_smooth fits a separate line per colour group, mirroring the
# interaction model (slopes free to differ).
ldt.c %>% ggplot(aes(length_c, rt, color = stop)) +
geom_point(alpha = .5) +
geom_smooth(method = 'lm', se = F)
## `geom_smooth()` using formula 'y ~ x'
# Alternative length measure: word length in characters, mean-centred.
ldt.c <- ldt.c %>% mutate(length_char = str_length(word),
length_char_c = length_char - mean(length_char))
# The two length measures are strongly related to each other.
ldt.c %>% ggplot(aes(x = length, y = length_char_c)) + geom_point() +
geom_smooth(formula = y ~ x, method = 'lm', se = FALSE)
# Model with BOTH (highly correlated) length predictors — a collinearity demo.
model.length.length <- lm(rt ~ length_c + length_char_c, data = ldt.c)
# Compare model-level fit: R-squared only rises from 0.284 to 0.302.
rbind(
broom::glance(model.length),
broom::glance(model.length.length)
)
## # A tibble: 2 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.284 0.284 105. 13112. 0 1 -200949. 401905. 401930.
## 2 0.302 0.302 104. 7159. 0 2 -200523. 401054. 401087.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Compare coefficients: the length slope drops from 29.8 to 12.9 once
# length_char_c is added — a symptom of collinearity.
rbind(
broom::tidy(model.length),
broom::tidy(model.length.length)
)
## # A tibble: 5 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 572. 1.82 314. 0
## 2 length 29.8 0.260 115. 0
## 3 (Intercept) 770. 0.572 1347. 0
## 4 length_c 12.9 0.629 20.5 1.28e- 92
## 5 length_char_c 17.2 0.586 29.4 1.53e-187
adding another predictor has not given us much gain in \(R^2\) (also look at adjusted \(R^2\))
it did however change the coefficient for the length drastically, changing the interpretation!
we can use variance inflation factors (vif) to check if we are dealing with collinearity
vif should be close to one; it should start worrying you when it’s above 5
# VIFs for the length + stopword model: both near 1, no collinearity concern.
car::vif(model.length.stop)
## length_c stop
## 1.003366 1.003366
# VIFs for the two-length-measure model: above 5, the predictors are collinear.
car::vif(model.length.length)
## length_c length_char_c
## 6.008984 6.008984
# Diagnostic plots for the length + stopword model.
plot(model.length.stop)